{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/jack/opt/anaconda3/envs/tf_2_0/lib/python3.7/site-packages/lightgbm/__init__.py:48: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n",
      "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
      "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
      "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
      "  \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
     ]
    }
   ],
   "source": [
    "from deeptables.models import deeptable,deepnets\n",
    "from deeptables.utils import consts,dt_logging,batch_trainer\n",
    "from deeptables.datasets import dsutils\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import roc_auc_score, roc_curve\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Base dir:/Users/jack/opt/anaconda3/envs/tf_2_0/lib/python3.7/site-packages/deeptables/datasets\n"
     ]
    }
   ],
   "source": [
    "# Load the Adult sample dataset through the deeptables dsutils helper.\n",
    "data = dsutils.load_adult()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 class detected, {' <=50K', ' >50K'}, so inferred as a [binary classification] task\n",
      "Start training DT model.['dnn_nets', 'fm_nets']\n",
      "train metrics:['AUC']\n",
      "eval metrics:['AUC', 'accuracy', 'recall', 'precision', 'f1']\n",
      "Fitting model...\n",
      "Start cross validation\n",
      "2 class detected, {' <=50K', ' >50K'}, so inferred as a [binary classification] task\n",
      "Preparing features cost:0.060250282287597656\n",
      "Imputation cost:0.11064696311950684\n",
      "Categorical encoding cost:0.18018293380737305\n",
      "Discretization cost:0.07815718650817871\n",
      "fit_transform cost:0.4662480354309082\n",
      "transform X_eval\n",
      "transform_X cost:0.8127491474151611\n",
      "Iterators:StratifiedKFold(n_splits=2, random_state=9527, shuffle=True)\n",
      "Injected a callback [EarlyStopping]. monitor:val_AUC, patience:1, mode:max\n",
      "\n",
      "Fold:1\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>>>>>>>>>>>>>>>>>>>>> Model Desc <<<<<<<<<<<<<<<<<<<<<<< \n",
      "---------------------------------------------------------\n",
      "inputs:\n",
      "---------------------------------------------------------\n",
      "['all_categorical_vars: (15)', 'input_continuous_all: (6)']\n",
      "---------------------------------------------------------\n",
      "embeddings:\n",
      "---------------------------------------------------------\n",
      "input_dims: [11, 18, 9, 17, 8, 7, 4, 43, 18, 5, 14, 5, 5, 5, 4]\n",
      "output_dims: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]\n",
      "dropout: 0.3\n",
      "---------------------------------------------------------\n",
      "dense: dropout: 0\n",
      "batch_normalization: False\n",
      "---------------------------------------------------------\n",
      "concat_embed_dense: shape: (None, 66)\n",
      "---------------------------------------------------------\n",
      "nets: ['dnn_nets']\n",
      "---------------------------------------------------------\n",
      "dnn: input_shape (None, 66), output_shape (None, 64)\n",
      "---------------------------------------------------------\n",
      "stacking_op: add\n",
      "---------------------------------------------------------\n",
      "output: activation: sigmoid, output_shape: (None, 1), use_bias: True\n",
      "loss: binary_crossentropy\n",
      "optimizer: Adam\n",
      "---------------------------------------------------------\n",
      "\n",
      "Train on 13024 samples, validate on 13024 samples\n",
      "13024/13024 [==============================] - 15s 1ms/sample - loss: 0.3796 - AUC: 0.8595 - val_loss: 0.6363 - val_AUC: 0.8306\n",
      "Fold 1 fitting over.\n",
      "Fold 1 scoring over.\n",
      "Save model to:dt_output/dt_20200331 182816_dnn_nets/dnn_nets-kfold-1.h5.\n",
      "\n",
      "Fold:2\n",
      "\n",
      ">>>>>>>>>>>>>>>>>>>>>> Model Desc <<<<<<<<<<<<<<<<<<<<<<< \n",
      "---------------------------------------------------------\n",
      "inputs:\n",
      "---------------------------------------------------------\n",
      "['all_categorical_vars: (15)', 'input_continuous_all: (6)']\n",
      "---------------------------------------------------------\n",
      "embeddings:\n",
      "---------------------------------------------------------\n",
      "input_dims: [11, 18, 9, 17, 8, 7, 4, 43, 18, 5, 14, 5, 5, 5, 4]\n",
      "output_dims: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]\n",
      "dropout: 0.3\n",
      "---------------------------------------------------------\n",
      "dense: dropout: 0\n",
      "batch_normalization: False\n",
      "---------------------------------------------------------\n",
      "concat_embed_dense: shape: (None, 66)\n",
      "---------------------------------------------------------\n",
      "nets: ['dnn_nets']\n",
      "---------------------------------------------------------\n",
      "dnn: input_shape (None, 66), output_shape (None, 64)\n",
      "---------------------------------------------------------\n",
      "stacking_op: add\n",
      "---------------------------------------------------------\n",
      "output: activation: sigmoid, output_shape: (None, 1), use_bias: True\n",
      "loss: binary_crossentropy\n",
      "optimizer: Adam\n",
      "---------------------------------------------------------\n",
      "\n",
      "Train on 13024 samples, validate on 13024 samples\n",
      "13024/13024 [==============================] - 14s 1ms/sample - loss: 0.3915 - AUC: 0.8498 - val_loss: 0.6754 - val_AUC: 0.8283\n",
      "Fold 2 fitting over.\n",
      "Fold 2 scoring over.\n",
      "Save model to:dt_output/dt_20200331 182816_dnn_nets/dnn_nets-kfold-2.h5.\n",
      "fit_cross_validation cost:37.16428899765015\n",
      "Scoring...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   35.9s finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "------------OOF------------ score:\n",
      "{'auc': 0.8276811406126862, 'accuracy': 0.5446099508599509, 'recall': 0.9264542716753844, 'precision': 0.3389781360552108, 'f1': 0.49634850543478265}\n",
      "\n",
      "------------CV------------ Eval score:\n",
      "{'auc': 0.826381044313037, 'accuracy': 0.5330876708122217, 'recall': 0.922976501305483, 'precision': 0.3260318192298824, 'f1': 0.48185380814448797}\n",
      "DT finished.\n",
      "DT - ['dnn_nets', 'fm_nets'] - done in 38s\n",
      "----------------------------------------------------------\n",
      "\n",
      "Start training DT model.['dnn_nets', 'fm_nets', 'cross_nets']\n",
      "train metrics:['AUC']\n",
      "eval metrics:['AUC', 'accuracy', 'recall', 'precision', 'f1']\n",
      "Fitting model...\n",
      "Start cross validation\n",
      "2 class detected, {' <=50K', ' >50K'}, so inferred as a [binary classification] task\n",
      "Preparing features cost:0.03995394706726074\n",
      "Imputation cost:0.10024118423461914\n",
      "Categorical encoding cost:0.18143486976623535\n",
      "Discretization cost:0.07245016098022461\n",
      "fit_transform cost:0.42642879486083984\n",
      "transform X_eval\n",
      "transform_X cost:1.0402209758758545\n",
      "Iterators:StratifiedKFold(n_splits=2, random_state=9527, shuffle=True)\n",
      "Injected a callback [EarlyStopping]. monitor:val_AUC, patience:1, mode:max\n",
      "\n",
      "Fold:1\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>>>>>>>>>>>>>>>>>>>>> Model Desc <<<<<<<<<<<<<<<<<<<<<<< \n",
      "---------------------------------------------------------\n",
      "inputs:\n",
      "---------------------------------------------------------\n",
      "['all_categorical_vars: (15)', 'input_continuous_all: (6)']\n",
      "---------------------------------------------------------\n",
      "embeddings:\n",
      "---------------------------------------------------------\n",
      "input_dims: [11, 18, 9, 17, 8, 7, 4, 43, 18, 5, 14, 5, 5, 5, 4]\n",
      "output_dims: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]\n",
      "dropout: 0.3\n",
      "---------------------------------------------------------\n",
      "dense: dropout: 0\n",
      "batch_normalization: False\n",
      "---------------------------------------------------------\n",
      "concat_embed_dense: shape: (None, 66)\n",
      "---------------------------------------------------------\n",
      "nets: ['dnn_nets']\n",
      "---------------------------------------------------------\n",
      "dnn: input_shape (None, 66), output_shape (None, 64)\n",
      "---------------------------------------------------------\n",
      "stacking_op: add\n",
      "---------------------------------------------------------\n",
      "output: activation: sigmoid, output_shape: (None, 1), use_bias: True\n",
      "loss: binary_crossentropy\n",
      "optimizer: Adam\n",
      "---------------------------------------------------------\n",
      "\n",
      "Train on 13024 samples, validate on 13024 samples\n",
      "13024/13024 [==============================] - 15s 1ms/sample - loss: 0.4114 - AUC: 0.8346 - val_loss: 0.6052 - val_AUC: 0.8333\n",
      "Fold 1 fitting over.\n",
      "Fold 1 scoring over.\n",
      "Save model to:dt_output/dt_20200331 182854_dnn_nets/dnn_nets-kfold-1.h5.\n",
      "\n",
      "Fold:2\n",
      "\n",
      ">>>>>>>>>>>>>>>>>>>>>> Model Desc <<<<<<<<<<<<<<<<<<<<<<< \n",
      "---------------------------------------------------------\n",
      "inputs:\n",
      "---------------------------------------------------------\n",
      "['all_categorical_vars: (15)', 'input_continuous_all: (6)']\n",
      "---------------------------------------------------------\n",
      "embeddings:\n",
      "---------------------------------------------------------\n",
      "input_dims: [11, 18, 9, 17, 8, 7, 4, 43, 18, 5, 14, 5, 5, 5, 4]\n",
      "output_dims: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]\n",
      "dropout: 0.3\n",
      "---------------------------------------------------------\n",
      "dense: dropout: 0\n",
      "batch_normalization: False\n",
      "---------------------------------------------------------\n",
      "concat_embed_dense: shape: (None, 66)\n",
      "---------------------------------------------------------\n",
      "nets: ['dnn_nets']\n",
      "---------------------------------------------------------\n",
      "dnn: input_shape (None, 66), output_shape (None, 64)\n",
      "---------------------------------------------------------\n",
      "stacking_op: add\n",
      "---------------------------------------------------------\n",
      "output: activation: sigmoid, output_shape: (None, 1), use_bias: True\n",
      "loss: binary_crossentropy\n",
      "optimizer: Adam\n",
      "---------------------------------------------------------\n",
      "\n",
      "Train on 13024 samples, validate on 13024 samples\n",
      "13024/13024 [==============================] - 19s 1ms/sample - loss: 0.3839 - AUC: 0.8557 - val_loss: 0.7234 - val_AUC: 0.8428\n",
      "Fold 2 fitting over.\n",
      "Fold 2 scoring over.\n",
      "Save model to:dt_output/dt_20200331 182854_dnn_nets/dnn_nets-kfold-2.h5.\n",
      "fit_cross_validation cost:42.4756920337677\n",
      "Scoring...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   41.0s finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "------------OOF------------ score:\n",
      "{'auc': 0.805177582509604, 'accuracy': 0.5677595208845209, 'recall': 0.9205896338563956, 'precision': 0.350597609561753, 'f1': 0.5078032786885246}\n",
      "\n",
      "------------CV------------ Eval score:\n",
      "{'auc': 0.834870615388083, 'accuracy': 0.5340089052663903, 'recall': 0.9301566579634465, 'precision': 0.32736044107512063, 'f1': 0.48428207306711984}\n",
      "DT finished.\n",
      "DT - ['dnn_nets', 'fm_nets', 'cross_nets'] - done in 43s\n",
      "----------------------------------------------------------\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preprocessing and metric options shared by every DeepTable candidate below.\n",
    "conf = deeptable.ModelConfig(\n",
    "    metrics=['AUC'],\n",
    "    auto_categorize=True,\n",
    "    auto_discrete=True,\n",
    "    cat_exponent=0.4,\n",
    "    cat_remain_numeric=True,\n",
    ")\n",
    "\n",
    "# One training run is launched per inner list of net names.\n",
    "# NOTE(review): the recorded output prints nets: ['dnn_nets'] for both runs --\n",
    "# confirm that these combinations actually reach the model.\n",
    "candidate_nets = [\n",
    "    ['dnn_nets', 'fm_nets'],\n",
    "    ['dnn_nets', 'fm_nets', 'cross_nets'],\n",
    "]\n",
    "\n",
    "# Metrics reported for each run (logged as 'eval metrics').\n",
    "report_metrics = ['AUC', 'accuracy', 'recall', 'precision', 'f1']\n",
    "\n",
    "# Keyword options for the batch trainer: hold-out fractions, a single epoch,\n",
    "# and 2-fold cross validation.\n",
    "trainer_options = dict(\n",
    "    eval_size=0.2,\n",
    "    validation_size=0.2,\n",
    "    eval_metrics=report_metrics,\n",
    "    verbose=1,\n",
    "    dt_epochs=1,\n",
    "    dt_config=conf,\n",
    "    dt_nets=candidate_nets,\n",
    "    cross_validation=True,\n",
    "    num_folds=2,\n",
    ")\n",
    "\n",
    "# 'x_14' is passed as the target column (the logged classes are ' <=50K' / ' >50K').\n",
    "bt = batch_trainer.BatchTrainer(data, 'x_14', **trainer_options)\n",
    "\n",
    "# start() runs the training; its result (ms) exposes leaderboard().\n",
    "ms = bt.start()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model</th>\n",
       "      <th>type</th>\n",
       "      <th>*auc</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>recall</th>\n",
       "      <th>precision</th>\n",
       "      <th>f1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>conf-1 - ['dnn_nets', 'fm_nets', 'cross_nets']...</td>\n",
       "      <td>cv-eval</td>\n",
       "      <td>0.834871</td>\n",
       "      <td>0.534009</td>\n",
       "      <td>0.930157</td>\n",
       "      <td>0.327360</td>\n",
       "      <td>0.484282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>conf-1 - ['dnn_nets', 'fm_nets'] - CV - oof</td>\n",
       "      <td>oof</td>\n",
       "      <td>0.827681</td>\n",
       "      <td>0.544610</td>\n",
       "      <td>0.926454</td>\n",
       "      <td>0.338978</td>\n",
       "      <td>0.496349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>conf-1 - ['dnn_nets', 'fm_nets'] - CV - eval</td>\n",
       "      <td>cv-eval</td>\n",
       "      <td>0.826381</td>\n",
       "      <td>0.533088</td>\n",
       "      <td>0.922977</td>\n",
       "      <td>0.326032</td>\n",
       "      <td>0.481854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>conf-1 - ['dnn_nets', 'fm_nets', 'cross_nets']...</td>\n",
       "      <td>oof</td>\n",
       "      <td>0.805178</td>\n",
       "      <td>0.567760</td>\n",
       "      <td>0.920590</td>\n",
       "      <td>0.350598</td>\n",
       "      <td>0.507803</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               model     type      *auc  \\\n",
       "0  conf-1 - ['dnn_nets', 'fm_nets', 'cross_nets']...  cv-eval  0.834871   \n",
       "1        conf-1 - ['dnn_nets', 'fm_nets'] - CV - oof      oof  0.827681   \n",
       "2       conf-1 - ['dnn_nets', 'fm_nets'] - CV - eval  cv-eval  0.826381   \n",
       "3  conf-1 - ['dnn_nets', 'fm_nets', 'cross_nets']...      oof  0.805178   \n",
       "\n",
       "   accuracy    recall  precision        f1  \n",
       "0  0.534009  0.930157   0.327360  0.484282  \n",
       "1  0.544610  0.926454   0.338978  0.496349  \n",
       "2  0.533088  0.922977   0.326032  0.481854  \n",
       "3  0.567760  0.920590   0.350598  0.507803  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the score table for the trained models; top=100 is well above the\n",
    "# 4 rows this run produces, so nothing is cut off.\n",
    "leaderboard = ms.leaderboard(top=100)\n",
    "leaderboard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
